# Remove every object that ls() lists in the current environment so the script
# does not pick up objects left over from earlier work in the same session.
# Attached packages are not affected by this call.
rm(list = ls())
##################################
##                              ##
##         Diagnostics          ##
##                              ##
##################################

# Top sentences for each style

# Load the saved inputs. load() restores objects under the names they were
# saved with, so the object names used below come from the .Rdata files.
load("working/dictionaries_sentence.Rdata") # Sentence-level dictionary scores (presumably `dictionary_scores` -- confirm)
load("working/complexity_sentence.Rdata") # Sentence-level complexity scores (presumably `complexity_scores` -- confirm)
load("working/repetition_sentence.Rdata") # Sentence-level repetition scores (presumably `repetition_scores` -- confirm)
load("working/speech_scores.Rdata") # Presumably `speech_scores`, used by the models further down -- confirm
load("data/debates.Rdata") # Raw debate information (presumably `debates` -- confirm)

# Standardise a numeric vector to z-scores: subtract the mean and divide by the
# standard deviation, both computed with missing values removed.
#
# x: numeric vector.
# Returns a numeric vector of the same length as x. NA inputs stay NA. A
# constant vector has standard deviation 0, so its result is NaN (0 / 0).
stdize <- function(x) {
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
library(plm)

# Standardise the raw complexity score, then copy the standardised complexity
# and repetition scores onto the dictionary table.
# NOTE(review): these are positional copies, so they assume complexity_scores,
# repetition_scores and dictionary_scores hold the same sentences in the same
# row order -- confirm against the scripts that create them.
complexity_scores$complexity_std <- stdize(complexity_scores$complexity)
dictionary_scores$complexity_std <- complexity_scores$complexity_std
dictionary_scores$repetition_std <- repetition_scores$repetition_std

# Attach the is_speaker flag from the raw debates by epobject_id. This has to
# come after the positional copies above because merge() may reorder the rows.
dictionary_scores <- merge(dictionary_scores,
                           debates[, c("is_speaker", "epobject_id")],
                           by = "epobject_id")

# Express each glove_* score as a share of the sentence's word count.
dictionary_scores[, glove_affect_prop := glove_affect / n_words]
dictionary_scores[, glove_posemo_prop := glove_posemo / n_words]
dictionary_scores[, glove_negemo_prop := glove_negemo / n_words]
dictionary_scores[, glove_fact_prop := glove_fact / n_words]
dictionary_scores[, glove_anecdote_prop := glove_anecdote / n_words]
dictionary_scores[, glove_aggression_prop := glove_aggression / n_words]

# Keep sentences of between min_words and max_words words (inclusive) and drop
# the rows flagged is_speaker.
min_words <- 20
max_words <- 120
dictionary_scores_tmp <- dictionary_scores[n_words >= min_words & n_words <= max_words]
dictionary_scores_tmp <- dictionary_scores_tmp[is_speaker == FALSE]

# Return the rows of `scores` holding the `n` largest values of the column
# named `score_col` (highest first, NAs ranked last), then drop rows whose
# `sent` value repeats an earlier row.
#
# scores:    data.table of sentence-level scores with a `sent` column.
# score_col: name of the column to rank by (character scalar).
# n:         maximum number of rows to take before de-duplication.
#
# head() caps the selection at the number of rows available. Indexing with
# [1:n] instead would pad the index with NA when the table is shorter than n,
# and data.table turns NA indices into all-NA rows.
top_unique_sentences <- function(scores, score_col, n = 150) {
  ranked <- order(scores[[score_col]], decreasing = TRUE)
  keep_rows <- head(ranked, n)
  top_rows <- scores[keep_rows]
  first_seen <- !duplicated(top_rows$sent)
  top_rows[first_seen]
}

top_affect <- top_unique_sentences(dictionary_scores_tmp, "glove_affect_prop")
top_posemo <- top_unique_sentences(dictionary_scores_tmp, "glove_posemo_prop")
top_negemo <- top_unique_sentences(dictionary_scores_tmp, "glove_negemo_prop")
top_fact <- top_unique_sentences(dictionary_scores_tmp, "glove_fact_prop")
top_anecdote <- top_unique_sentences(dictionary_scores_tmp, "glove_anecdote_prop")
top_aggression <- top_unique_sentences(dictionary_scores_tmp, "glove_aggression_prop")
top_complexity <- top_unique_sentences(dictionary_scores_tmp, "complexity_std")
top_repetition <- top_unique_sentences(dictionary_scores_tmp, "repetition_std")

save(top_affect, top_posemo, top_negemo, top_fact, top_anecdote,
     top_aggression, top_repetition, top_complexity,
     file = "working/top_sentences.Rdata")


##################################
##                              ##
##         Analysis             ##
##                              ##
##################################


## Debate type models

# Names of the standardised style columns used as model outcomes.
style_types <- c(
  "affect_std",
  "posemo_std",
  "negemo_std",
  "fact_std",
  "anecdote_std",
  "aggression_std",
  "complexity_std",
  "repetition_std"
)

# Display labels, one per entry of style_types and in the same order.
style_type_labels <- c(
  "Emotion",
  "Pos. Emotion",
  "Neg. Emotion",
  "Fact",
  "Anecdote",
  "Aggression",
  "Complexity",
  "Repetition"
)

# Right-hand side of a covariate model formula, kept as a string: the terms
# below joined with " + " and prefixed with "~ ".
covariate_formula <- paste(
  "~",
  paste(
    c(
      "gender",
      "party",
      "age_years",
      "attends_cabinet",
      "attends_shadow_cabinet",
      "is_gov_minister",
      "is_opp_minister",
      "is_committee_chair",
      "years_in_parliament"
    ),
    collapse = " + "
  )
)

# Display labels for model terms, starting with the intercept.
covariate_labels <- c(
  "Intercept",
  "Female",
  "Party: Green",
  "Party: Labour",
  "Party: Lib Dem",
  "Party: Other",
  "Party: SNP",
  "Age (Years)",
  "Attends Cabinet",
  "Attends Shadow Cabinet",
  "Gov Minister",
  "Shadow Minister",
  "Committee Chair",
  "Years in Parliament"
)

# Build a table of coefficient estimates with +/- 1.96 standard-error bounds.
#
# x: a fitted model for which coef(summary(x)) returns a matrix with the
#    estimates in column 1 and their standard errors in column 2 (e.g. lm).
#    The default names an object, bivariate_model, that this script does not
#    appear to create, so pass the model explicitly.
# Returns a data.frame with one row per coefficient and the columns
# est (estimate), hi (est + 1.96 * se) and lo (est - 1.96 * se).
est_conf_ints <- function(x = bivariate_model){
  coef_table <- coef(summary(x))
  est <- coef_table[, 1]
  half_width <- 1.96 * coef_table[, 2]
  data.frame(est = est, hi = est + half_width, lo = est - half_width)
}

# Subsets on which each gender model is fitted. The element names become the
# names of the per-style result list; the values are the debate_type to keep,
# with NA meaning "use every row of speech_scores".
debate_subsets <- c(all = NA,
                    questions = "Questions",
                    PMQs = "PMQs",
                    procedure = "Procedure",
                    legislation = "Legislation",
                    opp_bbb = "Opposition/Backbench",
                    other = "Other")

# Fit `<style> ~ gender` by least squares weighted by n_words and return the
# estimate / interval table produced by est_conf_ints().
#
# style: name of the outcome column (character scalar).
# data:  rows to fit on; must contain the outcome, gender and n_words.
#
# The weights are taken from `data` itself, so they always line up with the
# rows being modelled.
fit_gender_model <- function(style, data) {
  model <- lm(as.formula(paste0(style, "~ gender")),
              data = data,
              weights = n_words)
  est_conf_ints(model)
}

# One entry per style; each entry is a list with one table per debate subset.
coef_list <- lapply(style_types, function(style) {
  print(style)
  lapply(debate_subsets, function(debate_type) {
    if (is.na(debate_type)) {
      model_data <- speech_scores
    } else {
      # which() drops rows whose debate_type is NA, matching how data.table
      # treats NA in a logical row filter.
      keep_rows <- which(speech_scores$debate_type == debate_type)
      model_data <- speech_scores[keep_rows]
    }
    fit_gender_model(style, model_data)
  })
})

names(coef_list) <- style_types

coef_list_debate_models <- coef_list

save(coef_list_debate_models, file = "working/debate_model_out.Rdata")


## Output manually constructed dictionaries

# Read the first column of a seed-word CSV and write it to `out_path` as a
# single "; "-separated line with no trailing newline.
#
# csv_path: path of the CSV file to read.
# out_path: path of the text file to (over)write.
# Returns the words invisibly.
#
# cat(file = ) writes straight to the file, so the session's output stream is
# never redirected and cannot be left diverted if a step fails.
write_seed_words <- function(csv_path, out_path) {
  words <- read.csv(csv_path, stringsAsFactors = FALSE)[, 1]
  cat(paste0(words, collapse = "; "), file = out_path)
  invisible(words)
}

anecdote_words <- write_seed_words("data/dictionaries/seed_words_anecdote.csv",
                                   "analysis/anecdote_words.txt")

aggression_words <- write_seed_words("data/dictionaries/LH_aggression_seed.csv",
                                     "analysis/aggression_words.txt")



library(margins) # v.0.3.26
library(tidyverse) # v.1.3.0

# Outcome columns and the facet labels used for them in the plot below
# (same order in both vectors).
style_types <- c(
  "affect_std", "posemo_std", "negemo_std", "fact_std",
  "anecdote_std", "aggression_std", "complexity_std", "repetition_std"
)

style_type_labels <- c(
  "Affect", "Pos. Emotion", "Neg. Emotion", "Fact",
  "Human Narrative", "Aggression", "Complexity", "Repetition"
)

# For each style, model the per-person row count on the person's average style
# score and collect the marginal effect of that score by term and gender.
margins_out_list <- list()
for (i in seq_along(style_types)) {
  print(style_types[i])

  # One row per person and parliamentary term: row count (N), mean style
  # score (y) and gender.
  person_term <- speech_scores[, list(N = .N,
                                      y = mean(get(style_types[i])),
                                      gender = unique(gender)),
                               by = list(person_id, parliamentary_term)]

  # Standardise the mean score within each parliamentary term.
  person_term[, y := scale(y), by = parliamentary_term]

  # Row count regressed on the score, fully interacted with term and gender.
  count_model <- lm(N ~ y * parliamentary_term * gender, data.frame(person_term))

  # Marginal effect of y at every combination of term and gender.
  margins_out <- margins::margins(
    count_model,
    variables = "y",
    at = list(parliamentary_term = unique(person_term$parliamentary_term),
              gender = c("Male", "Female"))
  )

  # Flatten the summary to a plain data.frame, dropping rows with no estimate.
  margins_out <- margins_out %>%
    summary() %>%
    as_tibble() %>%
    filter(!is.na(AME)) %>%
    as.data.frame()

  margins_out$var <- style_type_labels[i]

  margins_out_list[[i]] <- margins_out
}

# Stack the per-style tables into one data.frame.
margins_out <- do.call("rbind", margins_out_list)

# Fix the facet order, with "Human Narrative" first.
margins_out$var <- factor(
  margins_out$var,
  levels = c("Human Narrative", "Affect", "Pos. Emotion", "Neg. Emotion",
             "Fact", "Aggression", "Complexity", "Repetition")
)

# Symmetric y-axis limits wide enough for the most extreme interval bound.
lim <- max(abs(c(margins_out$lower, margins_out$upper)))
lims <- c(-lim, lim)

# Point estimates with interval bars by parliamentary term, split by gender,
# one facet per style.
p1 <- ggplot(
  margins_out,
  aes(x = parliamentary_term, y = AME, ymin = lower, ymax = upper,
      col = gender, linetype = gender)
) +
  geom_errorbar(width = .01, position = position_dodge(.2)) +
  geom_point(position = position_dodge(.2)) +
  theme_bw() +
  geom_hline(yintercept = 0, linetype = 2) +
  xlab("") +
  ylab("Effect of average style use on speech count") +
  scale_color_manual(values = c("black", "red")) +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1)
  ) +
  facet_wrap(~var, nrow = 2, ncol = 4) +
  ylim(lims)

# Write the figure to a 12 x 8 inch PDF.
pdf("analysis/plots/style_use_speech_rate.pdf", width = 12, height = 8)
print(p1)
dev.off()
